# Workflow that regenerates the ingest test fixtures and proposes them back
# to the repository as a pull request.
name: Ingest Test Fixtures Update PR

# Started by hand only; there are no push / pull_request / schedule triggers.
on:
  workflow_dispatch:

env:
  # Kept as a quoted string so YAML does not read it as the float 3.1.
  PYTHON_VERSION: '3.10'

# Default permissions granted to GITHUB_TOKEN for every job in this workflow.
permissions:
  contents: read
  # Lets jobs request an OIDC token. No step in this file requests one
  # directly; presumably a local action relies on it (TODO confirm).
  id-token: write

jobs:
  # First job in the chain: runs the repository-local base-cache action for
  # the configured Python version (presumably to populate the base dependency
  # cache for later jobs; confirm against .github/actions/base-cache).
  setup:
    runs-on: ubuntu-latest-m
    env:
      NLTK_DATA: ${{ github.workspace }}/nltk_data
    steps:
      # The checkout has to happen first so the local action path below exists
      # in the workspace. Pinned to v4 to match the other jobs in this file.
      - uses: actions/checkout@v4
      - uses: ./.github/actions/base-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}

  # Second job in the chain: runs the repository-local base-ingest-cache
  # action with check-only enabled once `setup` has finished. The exact
  # meaning of check-only is defined by that action (TODO confirm).
  setup_ingest:
    runs-on: ubuntu-latest
    env:
      NLTK_DATA: ${{ github.workspace }}/nltk_data
    needs: [setup]
    steps:
      # The checkout has to happen first so the local action path below exists
      # in the workspace. Pinned to v4 to match the other jobs in this file.
      - uses: actions/checkout@v4
      - uses: ./.github/actions/base-ingest-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          check-only: 'true'

  # Final job: runs the ingest source test script with OVERWRITE_FIXTURES set,
  # then opens a pull request containing the changed expected-output and
  # metrics directories.
  update-fixtures-and-pr:
    runs-on: ubuntu-latest-m
    env:
      NLTK_DATA: ${{ github.workspace }}/nltk_data
    needs: [setup_ingest]
    steps:
      # The checkout must come first: the local ./.github/actions/* action and
      # the test script used below only exist once the repository is checked
      # out.
      - uses: 'actions/checkout@v4'
      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      # Publishes the interpreter's full version tuple (joined with '-') as
      # the step output `version`. No later step in this job reads it.
      - name: Get full Python version
        id: full-python-version
        run: |
          echo "version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))")" >> "$GITHUB_OUTPUT"
      - name: Setup virtual environment
        uses: ./.github/actions/base-ingest-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Setup docker-compose
        uses: KengoTODA/actions-setup-docker-compose@v1
        with:
          version: '2.22.0'
      - name: Update test fixtures
        env:
          # Credentials for the connectors exercised by the ingest test script.
          AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }}
          BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
          CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }}
          CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }}
          DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
          DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
          DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
          DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
          GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
          GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
          HUBSPOT_API_TOKEN: ${{ secrets.HUBSPOT_API_TOKEN }}
          JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }}
          JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }}
          MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
          MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
          MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
          MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
          MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
          SALESFORCE_USERNAME: ${{ secrets.SALESFORCE_USERNAME }}
          SALESFORCE_CONSUMER_KEY: ${{ secrets.SALESFORCE_CONSUMER_KEY }}
          SALESFORCE_PRIVATE_KEY: ${{ secrets.SALESFORCE_PRIVATE_KEY }}
          SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
          SHAREPOINT_CRED: ${{ secrets.SHAREPOINT_CRED }}
          SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
          SHAREPOINT_PERMISSIONS_APP_ID: ${{ secrets.SHAREPOINT_PERMISSIONS_APP_ID }}
          SHAREPOINT_PERMISSIONS_APP_CRED: ${{ secrets.SHAREPOINT_PERMISSIONS_APP_CRED }}
          SHAREPOINT_PERMISSIONS_TENANT: ${{ secrets.SHAREPOINT_PERMISSIONS_TENANT }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
          UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
          NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
          AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Non-secret settings; quoted so they reach the script as strings.
          TABLE_OCR: "tesseract"
          OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
          OVERWRITE_FIXTURES: "true"
          CI: "true"
        run: |
          source .venv/bin/activate
          sudo apt-get update
          sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
          sudo apt-get install -y tesseract-ocr
          sudo apt-get install -y tesseract-ocr-kor
          tesseract --version
          ./test_unstructured_ingest/test-ingest-src.sh

      # Derives the PR branch name from the checked-out branch plus the short
      # HEAD SHA and exports it as BRANCH_NAME for the later steps.
      - name: Save branch name to environment file
        id: branch
        run: |
          original_branch=$(git rev-parse --abbrev-ref HEAD)
          suffix="|ingest-test-fixtures-update-$(git rev-parse --short HEAD)"
          branch_name="$original_branch$suffix"
          echo "BRANCH_NAME=$branch_name" >> "$GITHUB_ENV"

      # Looks up the title of the first pull request associated with HEAD and
      # exports it as PR_NAME. The token and repository are passed through the
      # environment rather than expanded into the script text.
      - name: Save PR name to environment file
        id: pr
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPOSITORY: ${{ github.repository }}
        run: |
          commit_sha=$(git rev-parse HEAD)
          # A failed lookup must not discard the regenerated fixtures, so an
          # HTTP or network error is treated as "no associated pull request".
          prs=$(curl -fsS -H "Authorization: Bearer ${GH_TOKEN}" \
            "https://api.github.com/repos/${REPOSITORY}/commits/${commit_sha}/pulls" \
            || echo '[]')
          # Produces an empty string instead of the literal "null" when there
          # is no associated pull request or the payload is not an array.
          pr_name=$(echo "$prs" | jq -r 'if type == "array" then (.[0].title // empty) else empty end')
          if [ -z "$pr_name" ]; then
            # Fall back to the current branch name so the title stays useful.
            pr_name=$(git rev-parse --abbrev-ref HEAD)
          fi
          echo "PR_NAME=$pr_name" >> "$GITHUB_ENV"

      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v5
        with:
          token: ${{ secrets.GH_CREATE_PR_TOKEN }}
          # Only the regenerated fixture directories are committed.
          add-paths: |
            test_unstructured_ingest/expected-structured-output
            test_unstructured_ingest/metrics
          commit-message: "Update ingest test fixtures"
          branch: ${{ env.BRANCH_NAME }}
          title: "${{ env.PR_NAME }} <- Ingest test fixtures update"
          assignees: ${{ github.actor }}
          reviewers: ${{ github.actor }}
          delete-branch: true
          body: |
            This pull request includes updated ingest test fixtures.
            Please review and merge if appropriate.
          # github.head_ref is only populated for pull_request events and is
          # empty under workflow_dispatch, so fall back to the dispatched ref.
          base: ${{ github.head_ref || github.ref_name }}
